Calculate Highly Variable Features And Get mC Fraction AnnData#

Purpose#

The purpose of this step is to select highly variable features (HVF) and generate cell-by-feature methylation fraction matrix for clustering. The highly variable features are selected by comparing feature’s normalized dispersion among cells.

Input#

  • Filtered cell metadata;

  • MCDS files;

  • Feature list from basic feature filtering

Output#

  • cell-by-HVF methylation fraction matrix stored in AnnData format, e.g., mCH adata and mCG adata.

Import#

import pathlib
import pandas as pd
import dask
from ALLCools.mcds import MCDS
from ALLCools.dataset import ALLCoolsDataset

brain_dataset = ALLCoolsDataset(
    '/home/hanliu/cemba3c/projects/ALLCools/Brain/snmC-seq2/')

Load Data#

Metadata#

metadata = pd.read_csv(brain_dataset.metadata_path, index_col=0)
total_cells = metadata.shape[0]
print(f'Metadata of {total_cells} cells')
Metadata of 4875 cells
metadata.head()
AllcPath mCCCFrac mCGFrac mCGFracAdj mCHFrac mCHFracAdj FinalReads InputReads MappedReads DissectionRegion BamFilteringRate MappingRate Plate Col384 Row384 FANSDate Slice Sample
8E_M_10 /gale/raidix/rdx-4/mapping/8E/CEMBA190711-8E-1... 0.005505 0.744279 0.742863 0.020649 0.015228 2714916.0 6036476 4014048.0 8E 0.676354 0.664965 CEMBA190711-8E-1 19 0 190711 8 8E_190711
8E_M_100 /gale/raidix/rdx-4/mapping/8E/CEMBA190711-8E-1... 0.004702 0.723100 0.721792 0.012400 0.007735 3302547.0 7683706 5370970.0 8E 0.614888 0.699008 CEMBA190711-8E-2 1 2 190711 8 8E_190711
8E_M_1000 /gale/raidix/rdx-4/mapping/8E/CEMBA190711-8E-3... 0.005423 0.739960 0.738542 0.021733 0.016399 1369094.0 3658050 2381916.0 8E 0.574787 0.651144 CEMBA190711-8E-4 6 5 190711 8 8E_190711
8E_M_1002 /gale/raidix/rdx-4/mapping/8E/CEMBA190711-8E-3... 0.004117 0.745511 0.744459 0.010192 0.006101 4571390.0 11822434 8079217.0 8E 0.565821 0.683380 CEMBA190711-8E-4 7 5 190711 8 8E_190711
8E_M_1003 /gale/raidix/rdx-4/mapping/8E/CEMBA190711-8E-3... 0.005528 0.750461 0.749074 0.023083 0.017652 1334845.0 3479288 2337068.0 8E 0.571162 0.671709 CEMBA190711-8E-3 8 4 190711 8 8E_190711
var_dim = 'chrom100k'
use_features = pd.read_csv('FeatureList.BasicFilter.txt',
                           header=None,
                           index_col=0).index
use_features.name = var_dim

MCDS#

total_mcds = MCDS.open(brain_dataset.mcds_paths,
                       var_dim=var_dim,
                       use_obs=metadata.index).sel({var_dim: use_features})

Add mC Rate#

total_mcds.add_mc_frac(var_dim=var_dim,
                       normalize_per_cell=True,
                       clip_norm_value=10)

total_mcds
<xarray.MCDS>
Dimensions:            (chrom100k: 24106, cell: 4875, count_type: 2, mc_type: 2)
Coordinates:
  * chrom100k          (chrom100k) <U10 'chr1_30' 'chr1_31' ... 'chrX_1698'
  * cell               (cell) <U15 '8E_M_3022' '8E_M_2746' ... '8J_M_2288'
  * count_type         (count_type) <U3 'mc' 'cov'
  * mc_type            (mc_type) <U3 'CGN' 'CHN'
Data variables:
    chrom100k_chrom    (chrom100k) <U5 dask.array<chunksize=(1911,), meta=np.ndarray>
    chrom100k_da       (cell, chrom100k, mc_type, count_type) uint16 dask.array<chunksize=(1000, 1911, 2, 2), meta=np.ndarray>
    chrom100k_end      (chrom100k) int64 dask.array<chunksize=(1911,), meta=np.ndarray>
    chrom100k_start    (chrom100k) int64 dask.array<chunksize=(1911,), meta=np.ndarray>
    chrom100k_da_frac  (cell, chrom100k, mc_type) float64 dask.array<chunksize=(1000, 1911, 2), meta=np.ndarray>
Attributes:
    obs_dim:  cell
    var_dim:  chrom100k

If downsample#

downsample = None

if downsample and total_cells > downsample:
    # make a downsampled mcds
    print(f'Downsample cells to {downsample} to calculate HVF.')
    downsample_cell_ids = metadata.sample(downsample, random_state=0).index
    mcds = total_mcds.sel(
        {obs_dim: total_mcds.get_index(obs_dim).isin(downsample_cell_ids)})
else:
    mcds = total_mcds
load = True

if load and (mcds.get_index('cell').size <= 20000):
    # load the relevant data so the computation can be fater, watch out memory!
    mcds[f'{var_dim}_da_frac'].load()
/home/hanliu/mambaforge/envs/wmb/lib/python3.8/site-packages/dask/core.py:119: RuntimeWarning: invalid value encountered in true_divide
  return func(*(_execute_task(a, cache) for a in args))

The RuntimeWarning is expected (due to cov == 0). You can ignore it.

Highly Variable Feature#

mCH#

# use SVR based method
mch_hvf = mcds.calculate_hvf_svr(mc_type='CHN', n_top_feature=15000, plot=True)
Fitting SVR with gamma 0.0415, predicting feature dispersion using mc_frac_mean and cov_mean.
Total Feature Number:     24106
Highly Variable Feature:  15000 (62.2%)

Save AnnData#

mch_adata = total_mcds.get_adata(mc_type='CHN', select_hvf=True)

mch_adata.write_h5ad(f'mCH.HVF.h5ad')

mch_adata
AnnData object with n_obs × n_vars = 4875 × 15000
    var: 'CHN_mean', 'CHN_dispersion', 'CHN_cov', 'CHN_score', 'CHN_feature_select'

mCG#

mcg_hvf = mcds.calculate_hvf_svr(mc_type='CGN', n_top_feature=15000, plot=True)
Fitting SVR with gamma 0.0415, predicting feature dispersion using mc_frac_mean and cov_mean.
Total Feature Number:     24106
Highly Variable Feature:  15000 (62.2%)

Save AnnData#

mcg_adata = total_mcds.get_adata(mc_type='CGN', select_hvf=True)

mcg_adata.write_h5ad(f'mCG.HVF.h5ad')

mcg_adata
AnnData object with n_obs × n_vars = 4875 × 15000
    var: 'CHN_mean', 'CHN_dispersion', 'CHN_cov', 'CHN_score', 'CHN_feature_select', 'CGN_mean', 'CGN_dispersion', 'CGN_cov', 'CGN_score', 'CGN_feature_select'